Exploratory analysis and data visualization

In this section, we use appropriate visualization techniques to explore the dataset and to identify patterns and relationships in the data.

# Convert the discrete (integer-coded) variables to factors in one pass
factor_vars <- c("gender", "race", "smoking", "hypertension",
                 "diabetes", "vaccine", "severity", "study")
dat[factor_vars] <- lapply(dat[factor_vars], as.factor)

# Drop the id column, relabel the integer-coded categoricals with readable
# levels, and set the reference level for each factor used in modeling.
#
# FIX: the case_when() conditions previously referenced `dat$gender` etc.,
# i.e. the *external* data frame rather than the piped data. That works here
# only because no earlier pipeline step changes rows, but it is fragile and
# defeats the purpose of the pipe — bare column names are used instead.
dat <- dat %>%
  select(-id) %>%
  mutate(
    gender = case_when(
      gender == 1 ~ "Male",
      TRUE ~ "Female"),
    race = case_when(
      race == 1 ~ "White",
      race == 2 ~ "Asian",
      race == 3 ~ "Black",
      TRUE ~ "Hispanic"),
    smoking = case_when(
      smoking == 0 ~ "Never Smoked",
      smoking == 1 ~ "Former Smoker",
      TRUE ~ "Current Smoker")) %>%
  mutate_if(is.character, as.factor) %>%
  mutate(race = relevel(race, ref = "White"),
         smoking = relevel(smoking, ref = "Never Smoked"),
         study = relevel(study, ref = "B"))

# Continuous variables
continuous_vars <- c("age", "height", "weight", "bmi", "SBP", "LDL")

# Scatter plot of each continuous predictor against recovery time with a
# smoother overlaid. FIX: aes_string() is deprecated since ggplot2 3.0.0
# (the knit output below warns about it); use tidy evaluation with the
# .data pronoun instead.
for (var in continuous_vars) {
  plot <- ggplot(dat, aes(x = .data[[var]], y = recovery_time)) +
    geom_point() +
    geom_smooth() +
    labs(x = var, y = "Time to Recovery (days)",
         title = paste(var, "vs. Time to Recovery")) +
    theme_classic()
  print(plot)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Discrete variables
discrete_vars <- c("gender", "race", "smoking", "hypertension", "diabetes", "vaccine", "severity", "study")

# Boxplot of recovery time within each level of every discrete predictor.
# FIX: aes_string() is deprecated since ggplot2 3.0.0; use the .data
# pronoun with aes() (tidy evaluation) instead.
for (var in discrete_vars) {
  plot <- ggplot(dat, aes(x = .data[[var]], y = recovery_time)) +
    geom_boxplot() +
    labs(x = var, y = "Time to Recovery (days)",
         title = paste(var, "vs. Time to Recovery")) +
    theme_classic()
  print(plot)
}

# Pairwise correlations among the continuous predictors,
# displayed as numbers on the lower triangle
dat_continuous <- select(dat, age, height, weight, bmi, SBP, LDL)

corrplot(cor(dat_continuous), method = 'number', type = 'lower')

# Variables to include in subset: age, height, weight, gender, race, smoking, hypertension, diabetes, vaccine, severity, study

# Earlier candidate subsets, kept for reference:
# dat_subset <- dat %>%
#   select(c(age, bmi, gender, race, smoking, hypertension, diabetes, vaccine, severity, study, recovery_time))

# dat_subset <- dat %>%
#   select(c(age, height, weight, gender, race, smoking, hypertension, diabetes, vaccine, severity, study, recovery_time))

# Final reduced predictor set plus the outcome
dat_subset <- dat %>%
  select(height, weight, vaccine, severity, study, recovery_time)

# dat %>% ggplot(aes(x = age, y = recovery_time)) +
#   geom_jitter() + geom_smooth() + theme_classic()

# Full Data: 80/20 train/test split
set.seed(1)
trainIndex <- createDataPartition(dat$recovery_time, p = 0.8, list = FALSE)
train_data <- dat[trainIndex, ]
test_data <- dat[-trainIndex, ]

# Subset of Data: 80/20 split on the reduced predictor set.
# FIX: train_data_sub / test_data_sub previously indexed dat_subset with
# `trainIndex` (computed from the full data) instead of `trainIndex_sub`.
# The two coincide here only because both partitions use the same outcome
# vector and seed; using the matching index is correct by construction.
set.seed(1)
trainIndex_sub <- createDataPartition(dat_subset$recovery_time, p = 0.8, list = FALSE)
train_data_sub <- dat_subset[trainIndex_sub, ]
test_data_sub <- dat_subset[-trainIndex_sub, ]

# Resampling setup: 10-fold cross-validation repeated 5 times.
# ctrl_SE selects the most parsimonious model within one SE of the best;
# ctrl_best selects the model with the lowest resampled error.
ctrl_SE <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  selectionFunction = "oneSE"
)

ctrl_best <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  selectionFunction = "best"
)

# Design matrices (dummy-coded, intercept column dropped) and responses
# for the full and reduced training sets
x <- model.matrix(recovery_time ~ ., data = train_data)[, -1]
y <- train_data$recovery_time

x_sub <- model.matrix(recovery_time ~ ., data = train_data_sub)[, -1]
y_sub <- train_data_sub$recovery_time
set.seed(1)
# LASSO model Full: alpha = 1 (pure L1 penalty), tune lambda only,
# one-SE rule for model selection.
# FIX: dropped the `data = train_data` argument — train() is called through
# its x/y interface here, which does not take `data`; the argument was
# silently forwarded and unused. T replaced with TRUE (T is reassignable).
lasso.fit <- train(
  x = x,
  y = y,
  method = "glmnet",
  trControl = ctrl_SE,
  tuneGrid = expand.grid(alpha = 1,
                         lambda = exp(seq(-6, -1, length = 100))),
  standardize = TRUE
)

ggplot(lasso.fit, highlight = TRUE) +
  theme_classic()

best_lambda <- lasso.fit$bestTune$lambda
coef(lasso.fit$finalModel, lasso.fit$bestTune$lambda)
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                                  s1
## (Intercept)           -840.04385489
## age                      0.16823163
## genderMale              -2.59075696
## raceAsian                3.50404233
## raceBlack                .         
## raceHispanic            -0.38081627
## smokingCurrent Smoker    2.97646686
## smokingFormer Smoker     1.72027649
## height                   4.86538187
## weight                  -5.41777863
## bmi                     17.43538534
## hypertension1            1.69648547
## diabetes1               -1.46759957
## SBP                      0.02233705
## LDL                     -0.02000477
## vaccine1                -6.58454252
## severity1                6.81026029
## studyA                  -4.75766922
set.seed(1)
# LASSO model Subset (reduced predictor set).
# FIX: dropped the unused `data` argument (x/y interface); T -> TRUE.
lasso.fit.sub <- train(
  x = x_sub,
  y = y_sub,
  method = "glmnet",
  trControl = ctrl_SE,
  tuneGrid = expand.grid(alpha = 1,
                         lambda = exp(seq(-6, 2, length = 100))),
  standardize = TRUE
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
ggplot(lasso.fit.sub, highlight = TRUE) +
  theme_classic()

best_lambda <- lasso.fit.sub$bestTune$lambda
coef(lasso.fit.sub$finalModel, lasso.fit.sub$bestTune$lambda)
## 6 x 1 sparse Matrix of class "dgCMatrix"
##                     s1
## (Intercept) 88.9382606
## height      -0.3782789
## weight       0.2434927
## vaccine1    -2.5641618
## severity1    0.7450711
## studyA      -0.5830858
set.seed(1)
# Elastic Net Full: tune the alpha (ridge-lasso mixing) and lambda jointly,
# selecting the lowest-RMSE combination.
# FIX: dropped the unused `data` argument (x/y interface); T -> TRUE.
enet.fit <- train(
  x = x,
  y = y,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = seq(0, 1, length = 21),
                         lambda = exp(seq(-5, 3, length = 100))),
  trControl = ctrl_best,
  standardize = TRUE
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
enet.fit$bestTune
##      alpha      lambda
## 1501  0.75 0.006737947
myCol <- rainbow(25)
myPar <- list(superpose.symbol = list(col = myCol),
              superpose.line = list(col = myCol))
plot(enet.fit, par.settings = myPar)

coef(enet.fit$finalModel, enet.fit$bestTune$lambda)
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                                  s1
## (Intercept)           -2.002609e+03
## age                    1.790879e-01
## genderMale            -2.947692e+00
## raceAsian              3.768215e+00
## raceBlack             -4.163253e-01
## raceHispanic          -7.390642e-01
## smokingCurrent Smoker  3.556445e+00
## smokingFormer Smoker   2.179138e+00
## height                 1.172685e+01
## weight                -1.267781e+01
## bmi                    3.825160e+01
## hypertension1          1.903922e+00
## diabetes1             -1.662082e+00
## SBP                    2.140266e-02
## LDL                   -2.939422e-02
## vaccine1              -6.793896e+00
## severity1              7.273653e+00
## studyA                -4.878764e+00
set.seed(1)
# Elastic Net Subset (reduced predictor set).
# FIX: dropped the unused `data` argument (x/y interface); T -> TRUE.
enet.fit.sub <- train(
  x = x_sub,
  y = y_sub,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = seq(0, 1, length = 21),
                         lambda = exp(seq(-5, 5, length = 100))),
  trControl = ctrl_best,
  standardize = TRUE
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
enet.fit.sub$bestTune
##    alpha   lambda
## 55     0 1.575457
myCol <- rainbow(25)
myPar <- list(superpose.symbol = list(col = myCol),
              superpose.line = list(col = myCol))
plot(enet.fit.sub, par.settings = myPar)

coef(enet.fit.sub$finalModel, enet.fit.sub$bestTune$lambda)
## 6 x 1 sparse Matrix of class "dgCMatrix"
##                      s1
## (Intercept) 133.1787123
## height       -0.7679694
## weight        0.5738241
## vaccine1     -6.4862640
## severity1     7.1420582
## studyA       -4.8267005
set.seed(1)
# Ridge Full: alpha = 0 fixes a pure L2 penalty; tune lambda only.
# FIX: dropped the unused `data` argument (x/y interface).
ridge.fit <- train(
  x = x,
  y = y,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = 0,
                         lambda = exp(seq(10, -6, length = 100))),
  trControl = ctrl_best
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
plot(ridge.fit, xTrans = log)

ridge.fit$bestTune
##    alpha    lambda
## 34     0 0.5134171
coef(ridge.fit$finalModel, s = ridge.fit$bestTune$lambda)
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                                  s1
## (Intercept)           -101.92788195
## age                      0.19639986
## genderMale              -2.71878900
## raceAsian                4.01116555
## raceBlack               -0.15248392
## raceHispanic            -0.98352878
## smokingCurrent Smoker    3.36685579
## smokingFormer Smoker     1.91249121
## height                   0.49957702
## weight                  -0.78408717
## bmi                      4.11762834
## hypertension1            1.68868623
## diabetes1               -1.84842122
## SBP                      0.04152787
## LDL                     -0.02903930
## vaccine1                -6.71601476
## severity1                6.91370346
## studyA                  -4.97385571
set.seed(1)
# Ridge Subset (reduced predictor set).
# FIX: dropped the unused `data` argument (x/y interface).
ridge.fit.sub <- train(
  x = x_sub,
  y = y_sub,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = 0,
                         lambda = exp(seq(10, -6, length = 100))),
  trControl = ctrl_best
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
plot(ridge.fit.sub, xTrans = log)

ridge.fit.sub$bestTune
##    alpha   lambda
## 41     0 1.591451
coef(ridge.fit.sub$finalModel, s = ridge.fit.sub$bestTune$lambda)
## 6 x 1 sparse Matrix of class "dgCMatrix"
##                      s1
## (Intercept) 133.1115977
## height       -0.7673816
## weight        0.5733665
## vaccine1     -6.4823273
## severity1     7.1377216
## studyA       -4.8237001
set.seed(1)
# Partial Least Squares on the full predictor set; tune the number of
# components from 1 to 15, centering and scaling the predictors first
pls.fit <- train(
  x = x,
  y = y,
  method = "pls",
  tuneGrid = data.frame(ncomp = 1:15),
  trControl = ctrl_best,
  preProcess = c("center", "scale")
)

ggplot(pls.fit, highlight = TRUE) + theme_classic()

set.seed(1)
# Partial Least Squares on the reduced predictor set; tune the number of
# components from 1 to 6, centering and scaling the predictors first
pls.fit.sub <- train(
  x = x_sub,
  y = y_sub,
  method = "pls",
  tuneGrid = data.frame(ncomp = 1:6),
  trControl = ctrl_best,
  preProcess = c("center", "scale")
)

ggplot(pls.fit.sub, highlight = TRUE) + theme_classic()

# MARS tuning grid: interaction degree 1-3, pruned model sizes 2-15
mars_grid <- expand.grid(degree = 1:3,
                         nprune = 2:15)

set.seed(1)
# MARS Full: multivariate adaptive regression splines on all predictors,
# selecting the tuning pair that minimizes resampled RMSE
mars.fit <- train(
  x = x,
  y = y,
  method = "earth",
  tuneGrid = mars_grid,
  metric = "RMSE",
  trControl = ctrl_best
)
## Loading required package: earth
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## 
## Attaching package: 'plotrix'
## The following object is masked from 'package:scales':
## 
##     rescale
## Loading required package: TeachingDemos
ggplot(mars.fit) + theme_classic()

mars.fit$bestTune
##    nprune degree
## 16      3      2
coef(mars.fit$finalModel)
##          (Intercept)          h(bmi-30.7) h(bmi-30.7) * studyA 
##             38.85227             27.39024            -20.91526
set.seed(1)
# MARS Sub: same tuning grid, fit on the reduced predictor set
mars.fit.sub <- train(
  x = x_sub,
  y = y_sub,
  method = "earth",
  tuneGrid = mars_grid,
  metric = "RMSE",
  trControl = ctrl_best
)

ggplot(mars.fit.sub) + theme_classic()

mars.fit.sub$bestTune
##    nprune degree
## 41     14      3
coef(mars.fit.sub$finalModel)
##                               (Intercept) 
##                               36.54169610 
##                           h(height-159.6) 
##                               -0.45993611 
##                           h(159.6-height) 
##                               10.10337216 
##          h(159.6-height) * h(weight-81.3) 
##                                6.27108983 
##          h(159.6-height) * h(81.3-weight) 
##                               -0.21300902 
## h(159.6-height) * h(weight-81.3) * studyA 
##                               -5.89235302 
##                            h(weight-77.8) 
##                                1.08027270 
##          h(171.6-height) * h(weight-77.8) 
##                                0.53281219 
##                                  vaccine1 
##                               -6.58139206 
## h(171.6-height) * h(weight-77.8) * studyA 
##                               -0.40470585 
##                                 severity1 
##                                7.48014816 
##          h(height-159.6) * h(87.3-weight) 
##                                0.07525588 
##                  h(159.6-height) * studyA 
##                               -6.23583295
# Compare resampled (CV) performance across all ten fitted models.
# NOTE(review): this object shadows caret's resamples() function name;
# kept as-is so downstream references continue to work.
resamples <- resamples(
  list(
    lasso = lasso.fit,
    enet = enet.fit,
    ridge = ridge.fit,
    pls = pls.fit,
    mars = mars.fit,
    lasso.sub = lasso.fit.sub,
    enet.sub = enet.fit.sub,
    ridge.sub = ridge.fit.sub,
    pls.sub = pls.fit.sub,
    mars.sub = mars.fit.sub
  )
)
bwplot(resamples, metric = "RMSE")

# png("plot_rmse.png", width = 800, height = 600, res = 150)  # Open the PNG graphics device
# bwplot(resamples, metric = "RMSE")  # Create your plot
# dev.off() 

# Test-set design matrices (dummy-coded, intercept column dropped)
x_test <- model.matrix(recovery_time ~ ., data = test_data)[, -1]
x_test_sub <- model.matrix(recovery_time ~ ., data = test_data_sub)[, -1]


# Helper: test-set RMSE of a fitted caret model (first element of
# postResample's RMSE/Rsquared/MAE vector)
test_rmse <- function(fit, newx, obs) {
  postResample(predict(fit, newdata = newx), obs)[1]
}

# Collect the model names and their test-set RMSE values
test_RMSE <- tibble(
  Model = c("LASSO", "Elastic Net", "Ridge", "PLS", "MARS",
            "LASSO (Subset)", "Elastic Net (Subset)", "Ridge (Subset)",
            "PLS (Subset)", "MARS (Subset)"),
  RMSE = c(
    test_rmse(lasso.fit, x_test, test_data$recovery_time),
    test_rmse(enet.fit, x_test, test_data$recovery_time),
    test_rmse(ridge.fit, x_test, test_data$recovery_time),
    test_rmse(pls.fit, x_test, test_data$recovery_time),
    test_rmse(mars.fit, x_test, test_data$recovery_time),
    test_rmse(lasso.fit.sub, x_test_sub, test_data_sub$recovery_time),
    test_rmse(enet.fit.sub, x_test_sub, test_data_sub$recovery_time),
    test_rmse(ridge.fit.sub, x_test_sub, test_data_sub$recovery_time),
    test_rmse(pls.fit.sub, x_test_sub, test_data_sub$recovery_time),
    test_rmse(mars.fit.sub, x_test_sub, test_data_sub$recovery_time)
  )
)

test_RMSE %>% arrange(RMSE)
## # A tibble: 10 × 2
##    Model                 RMSE
##    <chr>                <dbl>
##  1 MARS (Subset)         17.8
##  2 MARS                  17.9
##  3 PLS                   18.4
##  4 Elastic Net           18.4
##  5 LASSO                 19.1
##  6 Ridge                 19.8
##  7 Ridge (Subset)        20.4
##  8 Elastic Net (Subset)  20.4
##  9 PLS (Subset)          20.4
## 10 LASSO (Subset)        20.8